TFLearn [Subordinate Clause] Fragment Detection

This notebook is based on the original fragment detection notebook, but is specific to the detection of subordinate clause fragments. As our training data we will use a datafile of 50,000+ sentences that each contain a subordinate clause at the beginning, middle, or end of the sentence, and 50,000+ subordinate clauses extracted from those sentences -- these raw subordinate clauses will always be fragments. The labels will be either 1 or 0, where 1 indicates a subordinate clause fragment and 0 indicates that it is NOT a subordinate clause fragment (it does not mean it is a sentence).

Import Dependencies



In [ ]:

    
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
import spacy
nlp = spacy.load('en')
import re
from nltk.util import ngrams, trigrams
import csv

Create combined data



In [ ]:

    
import subprocess

# Run combine.py synchronously and fail loudly on a non-zero exit code.
# The loading cell reads the file this script is expected to produce, so the
# notebook must not move on until the script has finished (Popen returned
# immediately and never reported failures).
subprocess.check_call(
    ["python", "combine.py", "childrens_fragments"],
    cwd='../data/fragments/subordinate-clauses',
)

Load Datafiles



In [ ]:

    
texts = []
labels = []

# Lines are labelled by position: even 0-based line numbers get label 0 and
# odd ones get label 1 (presumably the combined file alternates full sentence /
# extracted fragment -- confirm against combine.py).
with open("../data/fragments/subordinate-clauses/childrens_fragments.combined.txt","r") as f:
    for line_number, line in enumerate(f):
        texts.append(line.strip())
        labels.append(line_number % 2)

print(texts[-10:])

Shuffle the data



In [ ]:

    
import random

# Fixed seed so the shuffled order -- and therefore the positional train/test
# split performed later -- is the same on every Restart & Run All.
SHUFFLE_SEED = 42

# Shuffle texts and labels together so each text keeps its own label.
combined = list(zip(texts, labels))
random.Random(SHUFFLE_SEED).shuffle(combined)

# Write the shuffled order back into the existing lists in place.
texts[:], labels[:] = zip(*combined)
print(texts[-10:])
print(labels[-10:])

Get parts of speech for text string



In [ ]:

    
def textStringToPOSArray(text):
    """Run `text` through the spaCy pipeline `nlp` and return each token's `tag_` as a list."""
    return [token.tag_ for token in nlp(text)]

# Spot-check: display the tag list for the text at index 3.
textStringToPOSArray(texts[3])

Get POS trigrams for a text string



In [ ]:

    
def find_ngrams(input_list, n):
    """Return a lazy `zip` of all consecutive length-`n` windows of `input_list`.

    The sequence is zipped against copies of itself shifted by 0..n-1 positions;
    `zip` stops at the shortest copy, so only complete windows are produced.
    """
    shifted_copies = (input_list[offset:] for offset in range(n))
    return zip(*shifted_copies)

def getPOSTrigramsForTextString(text):
    """Return the list of tag trigrams for `text`.

    Tags come from `textStringToPOSArray`; the trigrams are built with the
    `trigrams` helper imported from `nltk.util`.
    """
    return list(trigrams(textStringToPOSArray(text)))

# Spot-check: print the text at index 3 with its label, then display its tag trigrams.
print("Text: ", texts[3], labels[3])
getPOSTrigramsForTextString(texts[3])

Turn Trigrams into Dict keys



In [ ]:

    
def trigramsToDictKeys(trigrams):
    """Join each trigram's elements with '>' and return the resulting strings as a list.

    Note: the parameter name shadows the `trigrams` function imported from
    `nltk.util` inside this function only.
    """
    return ['>'.join(parts) for parts in trigrams]

# Spot-check: print the text at index 2 followed by its '>'-joined trigram keys.
print(texts[2])
print(trigramsToDictKeys(getPOSTrigramsForTextString(texts[2])))



In [ ]:

    
from collections import Counter

# Tally how often each trigram key occurs across every text in the data set.
c = Counter(
    key
    for textString in texts
    for key in trigramsToDictKeys(getPOSTrigramsForTextString(textString))
)

total_counts = c

# len() of a Counter is the number of distinct keys, i.e. distinct trigram keys here.
print("Total words in data set: ", len(total_counts))



In [ ]:

    
# Trigram keys ordered from most to least frequent.
vocab = [key for key, _count in total_counts.most_common()]
print(vocab[:60])



In [ ]:

    
# Show the last vocab entry (the list is sorted by descending count) and its count.
print(vocab[-1], ': ', total_counts[vocab[-1]])

Take the trigrams and index them



In [ ]:

    
# Word-to-index dictionary: each trigram key maps to its position in `vocab`.
word2idx = dict(zip(vocab, range(len(vocab))))
print(word2idx)



In [ ]:

    
def textToTrigrams(text):
    """Return the '>'-joined trigram keys for `text`."""
    pos_trigrams = getPOSTrigramsForTextString(text)
    return trigramsToDictKeys(pos_trigrams)

def text_to_vector(text):
    """Count-vectorise `text` over the trigram vocabulary.

    Returns a NumPy array of length ``len(vocab)`` (default dtype of
    ``np.zeros``) whose entry at ``word2idx[key]`` is the number of times that
    trigram key occurs in `text`. Keys missing from ``word2idx`` are skipped.
    """
    counts = np.zeros(len(vocab))
    for key in textToTrigrams(text):
        if key in word2idx:
            counts[word2idx[key]] += 1
    return counts



In [ ]:

    
# Spot-check: display the first 65 entries of the count vector for a sample string.
text_to_vector('Until I died, I laughed')[:65]



In [ ]:

    
# One row per text, one column per vocab entry; filled row by row so only a
# single integer matrix is allocated.
n_texts, n_features = len(texts), len(vocab)
word_vectors = np.zeros((n_texts, n_features), dtype=np.int_)
for row, text in zip(word_vectors, texts):
    # `row` is a view into word_vectors, so slice-assignment writes in place.
    row[:] = text_to_vector(text)



In [ ]:

    
# Display the first 5 word vectors, truncated to their first 23 columns
word_vectors[:5, :23]

Chunking the data for TF



In [ ]:

    
records = len(labels)
# NOTE: despite its name, this is the fraction of records used for TRAINING;
# the name is kept unchanged so anything referring to it still works.
test_fraction = 0.9

# Index where the training rows end and the test rows begin.
train_split = int(records * test_fraction)
# Number of held-out test records (exact, no floating-point rounding).
test_split = records - train_split
print(train_split, test_split)

# Train on rows [0, train_split) and test on rows [train_split, records).
# The previous version sliced the test set from `test_split` onwards, which
# made it overlap most of the training rows and inflated test accuracy.
trainX, trainY = word_vectors[:train_split], to_categorical(labels[:train_split], 2)
testX, testY = word_vectors[train_split:], to_categorical(labels[train_split:], 2)



In [ ]:

    
# Display the last training feature vector together with its one-hot label.
trainX[-1], trainY[-1]



In [ ]:

    
# Display the sizes of the training and test label arrays, and their sum.
len(trainY), len(testY), len(trainY) + len(testY)

Setting up TF



In [ ]:

    
# Network building
def build_model():
    """Build and return a new TFLearn `DNN` classifier over the trigram vocabulary.

    Architecture: input of width ``len(vocab)`` -> fully connected 200 (ReLU)
    -> fully connected 25 (ReLU) -> fully connected 2 (softmax), trained with
    SGD (learning rate 0.1) on categorical cross-entropy.
    """
    # Reset the default graph first so rebuilding the model starts from a clean slate.
    tf.reset_default_graph()

    layers = tflearn.input_data([None, len(vocab)])                    # Input
    for hidden_units in (200, 25):                                     # Hidden
        layers = tflearn.fully_connected(layers, hidden_units, activation='ReLU')
    layers = tflearn.fully_connected(layers, 2, activation='softmax')  # Output
    trainer = tflearn.regression(
        layers,
        optimizer='sgd',
        learning_rate=0.1,
        loss='categorical_crossentropy',
    )
    return tflearn.DNN(trainer)



In [ ]:

    
# Display the vocabulary size, which is also the width of the model's input layer.
len(vocab)

Initialize



In [ ]:

    
# Build the network; `model` is used by the training, testing, saving and playground cells.
model = build_model()

Training



In [ ]:

    
# Training: 50 epochs, batches of 128, metric display on.
# validation_set=0.1 -- presumably TFLearn holds out 10% of the training data
# for validation; confirm against the TFLearn DNN.fit documentation.
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=50)



In [ ]:

    
# Testing: threshold the column-0 output at 0.5 and compare it with column 0
# of the one-hot test labels; the mean of the matches is the accuracy.
column0_probs = np.array(model.predict(testX))[:, 0]
predictions = (column0_probs >= 0.5).astype(np.int_)
matches = predictions == testY[:, 0]
test_accuracy = np.mean(matches, axis=0)
print("Test accuracy: ", test_accuracy)



In [ ]:

    
# Persist the trigram-key -> index mapping as two-column CSV rows (key, index).
# The `with` block guarantees the file is flushed and closed; newline="" is what
# the csv module requires so it can control line endings itself.
with open("../models/subordinatevocabindex.csv", "w", newline="") as vocab_index_file:
    w = csv.writer(vocab_index_file)
    for key, val in word2idx.items():
        w.writerow([key, val])



In [ ]:

    
# Save the trained model under ../models.
model.save("../models/subordinate_model.tfl")

Playground



In [ ]:

    
def test_sentence(sentence):
    """Print the model's verdict on whether `sentence` is a subordinate clause fragment.

    The sentence is vectorised with `text_to_vector`, and column 1 of the model
    output is reported as P(positive); values above 0.5 print 'Yes', otherwise 'No'.
    """
    features = text_to_vector(sentence)
    positive_prob = model.predict([features])[0][1]
    verdict = 'Yes' if positive_prob > 0.5 else 'No'
    print('Is this a subordinate clause fragment?\n {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), verdict)



In [ ]:

    
# Standalone "Until ..." phrase with no main clause.
test_sentence("Until the end of time.")



In [ ]:

    
# Full sentence with the "until ..." phrase at the end.
test_sentence("She would love him until the end of time.")



In [ ]:

    
# Full sentence with the "Until ..." phrase at the start.
test_sentence("Until the end of time, she would love him.")



In [ ]:

    
# Full sentence with "in the dead of night" set off by commas in the middle.
test_sentence("Ryan, in the dead of night, arrived on the banks of the Delaware.")



In [ ]:

    
# Standalone "In the dead of night." phrase with no main clause.
test_sentence("In the dead of night.")



In [ ]:

    
# Full sentence with "In the dead of night" at the start.
test_sentence("In the dead of night, Ryan arrived on the banks of the Delaware.")



In [ ]:

    
# Full sentence with "in the dead of night" at the end.
test_sentence("Ryan arrived on the banks of the Delaware in the dead of night.")



In [ ]:

    
# Standalone "At the end of her rope." phrase with no main clause.
test_sentence("At the end of her rope.")



In [ ]:

    
# Full sentence ending with "at the end of her rope".
test_sentence("Cindy was at the end of her rope.")



In [ ]:

    
# Full sentence with "at the end of her rope" appended after a comma.
test_sentence("Cindy was done, at the end of her rope.")



In [ ]:

    
# Full sentence opening with "On the iron throne".
test_sentence("On the iron throne, Joffry looked rather fat.")



In [ ]:

    
# Standalone "On the iron throne." phrase with no main clause.
test_sentence("On the iron throne.")



In [ ]:

    
# Standalone "Unless ..." clause with no main clause.
test_sentence("Unless Christine finishes her calculus homework.")



In [ ]:

    
# The same "Unless ..." clause, now followed by a main clause.
test_sentence("Unless Christine finishes her calculus homework, she will have to suffer Mr. Nguyen's wrath in class tomorrow.")



In [ ]:

    
# Standalone "Because ..." clause with no main clause.
test_sentence("Because her best friend Giselle insisted on gossiping during their study session the night before.")



In [ ]:

    
# Standalone "While ..." clause with no main clause.
test_sentence("While Bailey slept on the sofa in front of the television.")



In [ ]:

    
# The same "While ..." clause, now followed by a main clause.
test_sentence("While Bailey slept on the sofa in front of the television, Samson, the family dog, gnawed on the leg of the coffee table.")



In [ ]:

    
# Main clause followed by the "because ..." clause at the end.
test_sentence("Tanya did poorly on her history exam because her best friend Giselle insisted on gossiping during their study session the night before.")

Inspect the vocab



In [ ]:

    
# Display the full vocab list (trigram keys, most frequent first).
vocab



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]: